Simulation

  1. Explore Clustering a). Set ICC=0.3, using the function from the last homework to simulate data:
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggfortify)
library(colorRamps)
library(htmlTable)
#this function return simulated ICC data
get_data<-function(target, n, k, mu, sigmaw){
  #create label for group
  group <- factor(rep(1:k, each = n/k))
  #create simulated y value for simulated target
  sigmab<-sqrt(sigmaw^2*target/(1-target))
  value <- c(sapply(rnorm(k,mu,sigmab), function(mui) rnorm(n/k, mui, sigmaw)))#observations
  data <- data.frame(group = group, value = value)
  return(data)
}

Here, a function was developed to calculate the purity when evaluating clustering algorithm: \[purity(\omega, c)=\frac{1}{N}\sum^{k}max_{j}|\omega_k \cap c_j|\]

#this function returns purity after applying clustering algorithms
purity<-function(clusters, classes) {
        sum(apply(table(classes, clusters), 2, max)) / length(clusters)
}

Using the above two functions to evaluate the performance of k-means clustering

#set seed for simulations
set.seed(1117)
#get simulated data
dat<-replicate(50, get_data(target = 0.3, n = 100, k = 3, mu = 5, sigmaw = 0.1)$value)
dat<-as.data.frame(dat)
dat$group<-get_data(target = 0.3, n = 100, k = 3, mu = 5, sigmaw = 0.1)$group
#apply k-mean cluster analysis with k=5
fit<-kmeans(dat[,c(1:50)], 5)
# append cluster assignment
dat <- data.frame(dat, fit$cluster)
#calculate the rate of misclassified observations
rate<-1-purity(dat$fit.cluster, dat$group)

Based on the number of misclassification, we could conclude that when we applied 5-means clustering on a data with a real group of three, there are nearly 31.3% false rate.

b). Apply different ICC value from 0.1 to 0.9 and visualize the rate of purity against ICC by ggplot2:

#this function returns purity for measuring accuracy of k-mean clustering
get.purity<-function(target, n, k, kmeans, p){
       #get simulated data
       dat<-replicate(p, get_data(target, n, k, mu = 5, sigmaw = 0.1)$value)
       dat<-as.data.frame(dat)
       dat$group<- as.numeric(get_data(target, n, k, mu = 5, sigmaw = 0.1)$group)
       #apply k-mean cluster analysis with k=5
       set.seed(222)
       fit<-kmeans(dat[,c(1:p)], kmeans)
       #append cluster assignment
       dat <- data.frame(dat, fit$cluster)
       #calculate the rate of purity
       return(purity(dat$fit.cluster, dat$group))
}

#get results for varing ICC
set.seed(24)
purity_result<-sapply(seq(0.1,0.9, 0.05), function(x) get.purity(target = x, n = 100, k = 3,    
                                                         kmeans = 3, p = 50))
summary<-data.frame(ICC = seq(0.1,0.9, 0.05), purity_result)

#using ggplot2 to visualize the results
ggplot(data = summary, aes(x = ICC, y = purity_result))+
  geom_line()+
  geom_point(size=2)+
  scale_y_continuous(name='Purity')+
  geom_hline(yintercept = 1, linetype = 'dashed')+
  expand_limits(y=0)+
  theme_classic()

We found when ICC reaches to 0.2, there is perfect clustering after applying 3-means clustering on a simulated data with a group of 3.The required ICC for effective clustering would be at least 0.2.

c). Using minimum ICC as 0.2. The data with noise variables were generated as following:

get_noise_data<-function(x, p = 50, n = 100, k =3, ICC = 0.2){
  if(x!= p && x!=0){
    temp1<- as.data.frame(replicate(p-x, get_data(ICC, n, k, mu = 5, sigmaw = 
                                    0.1)$value))
    temp2<- as.data.frame(replicate(x, get_data(target = 0, n, k, mu = 5, 
                                                sigmaw = 0.1)$value))
    temp<- data.frame(group = get_data(ICC, n, k, mu = 5, sigmaw = 
                                0.1)$group , temp1, temp2)}
  #if all variables are noisy
  if(x == p){
    temp<-as.data.frame(replicate(p, get_data(target = 0, n, k, mu = 5, sigmaw 
                                              = 0.1)$value))
    temp<-data.frame(group = get_data(target = 0, n, k, mu = 5, sigmaw = 
                                  0.1)$group, temp)
  }
  #none of the variables are noisy
  if(x == 0){
    temp<-as.data.frame(replicate(p, get_data(target = ICC, n, k, mu = 5, 
                                              sigmaw = 0.1)$value))
    temp<-data.frame(group = get_data(target = ICC, n, k, mu = 5, sigmaw = 
                                  0.1)$group, temp)
  }
  names(temp)<-c("group", paste0("X", c(1:p)))
  return(temp)
}

#number of noise variables t
get.purity.v2<-function(t, k, kmeans, n = 100, p =50, icc = 0.2){
       dat<-get_noise_data(x = t, p, n, k, icc)
       set.seed(222)
       fit<-kmeans(dat[,c(2:(p+1))], kmeans)
       #append cluster assignment
       dat <- data.frame(dat, fit$cluster)
       #calculate the rate of misclassification
       return(purity(dat$fit.cluster, dat$group))
}

set.seed(47)
#summary the results for varying noise variables
sum<-sapply(seq(0, 50, 5), function(x) get.purity.v2(x, k = 3, kmeans = 3, 100, 50, 0.2))
#visualize the results by ggplot2
temp<-data.frame(Noisy = seq(0, 50, 5), Purity = sum)
ggplot(data = temp, aes(x = Noisy, y = Purity))+
  geom_line()+
  geom_point(size=2)+
  scale_y_continuous(name ='Purity')+
  scale_x_continuous(name ='Number of Noisy Variables')+
  expand_limits(y=0)+
  theme_classic()

Comments: when the number of noisy variables increases, the purity of k-means clustering is reduced especially when it’s more than the number of non-noisy variables.

d). when we control the number of noisy variables and target ICC:

set.seed(117)
ICC<-sample(seq(0.05, 0.95, 0.001),200)
noisy<-sample(seq(0, 50, 1), 200, replace = TRUE)
purity0<-mapply(function(x,y) get.purity.v2(x, 3, 3, 100, 50, y), noisy, ICC)
temp2<-data.frame(noisy, ICC, Purity = purity0)
#scale the variables of noisy variable and ICC value
temp2[,c(1:2)]<-scale(temp2[,c(1:2)])
#fit lm model to explore interaction
model<-lm(Purity~ noisy*ICC, data = temp2)
summary(model)
## 
## Call:
## lm(formula = Purity ~ noisy * ICC, data = temp2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44892 -0.05557  0.00315  0.06334  0.19338 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.892481   0.007197 124.011  < 2e-16 ***
## noisy       -0.069218   0.007219  -9.588  < 2e-16 ***
## ICC          0.119077   0.007215  16.504  < 2e-16 ***
## noisy:ICC    0.050586   0.007267   6.961 4.97e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1016 on 196 degrees of freedom
## Multiple R-squared:  0.6668, Adjusted R-squared:  0.6617 
## F-statistic: 130.7 on 3 and 196 DF,  p-value: < 2.2e-16
#visualize the results
pp<-function(n){
  set.seed(117)
  ICC<-sample(seq(0.05, 0.95, 0.001), n)
  Noisy<-sample(seq(0, 50, 1), n, replace = TRUE)
  df<-expand.grid(x = ICC, y = Noisy)
  df$Purity<-mapply(function(x,y) get.purity.v2(x, 3, 3, 100, 50, y), Noisy, ICC)
  df
}
p <- ggplot(pp(200), aes(x=x,y=y))
p <- p + geom_tile(aes(fill=Purity))+
     scale_x_continuous(name ='ICC')+
     scale_y_continuous(name ='Number of Noisy Variables')+
     scale_color_gradientn(colours=matlab.like(5))+
     theme_classic()
     
ggplotly(p)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

Based on the ANOVA table, we could conclude that purity by k-means clustering is negatively correlated with the number of noisy variables and positively correlated with ICC value. In addition, we observed significant interaction between number of noisy variables. According to the plot, we found more low purity occurrs when ICC is low and the number of noisy variables is large.

2.PCA and clustering When ICC=0.5, it’s likely to uncover clusters. Biplot of first two PCs were shown as below:

From the scree plot, we find the first two PCs are meaningful.

b). When the k=4, based on the scree plot, we find the meaningful PCs are the first three. From the biplot, we found there were some overlapping points in group 2 and 4. The group separation by PC1 is not as good as the one when k=3. Thus, we conclude that when the number of “true group” increases, there is more need for meaningful PCs to recapture the orignial structure of the data.

Working with Data

  1. Exploring the data a). Descriptive statistics was shown as below:
#set working directory
library(xlsx)
## Loading required package: rJava
## Loading required package: xlsxjars
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:plotly':
## 
##     subplot
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library(cluster)
library(fpc)
library(lattice)
setwd("C:/Users/bitga/Downloads/")
mice_dat<-read.xlsx("Data_Cortex_Nuclear.xls", 1, header = TRUE)
mice_dat$MouseID<-as.character(mice_dat$MouseID)
describe(mice_dat[,c(2:78)])
## mice_dat[, c(2:78)] 
## 
##  77  Variables      1080  Observations
## ---------------------------------------------------------------------------
## DYRK1A_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.4258   0.2083   0.2241   0.2486 
##      .25      .50      .75      .90      .95 
##   0.2881   0.3664   0.4877   0.6447   0.7693 
## 
## lowest : 0.1453265 0.1568492 0.1633245 0.1643295 0.1684934
## highest: 2.2018314 2.3358022 2.4599481 2.4803157 2.5163674
## ---------------------------------------------------------------------------
## ITSN1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1076        1   0.6171   0.2276   0.3717   0.4010 
##      .25      .50      .75      .90      .95 
##   0.4734   0.5658   0.6980   0.8631   0.9899 
## 
## lowest : 0.2453585 0.2611850 0.2640852 0.2753054 0.2863181
## highest: 2.4886839 2.5194637 2.5390242 2.5801038 2.6026621
## ---------------------------------------------------------------------------
## BDNF_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.3191  0.05521   0.2428   0.2563 
##      .25      .50      .75      .90      .95 
##   0.2874   0.3166   0.3482   0.3849   0.4044 
## 
## lowest : 0.1151814 0.1859795 0.1941596 0.1972707 0.1981587
## highest: 0.4648484 0.4649639 0.4666728 0.4700556 0.4971599
## ---------------------------------------------------------------------------
## NR1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    2.297   0.3927    1.726    1.867 
##      .25      .50      .75      .90      .95 
##    2.057    2.297    2.528    2.732    2.890 
## 
## lowest : 1.330831 1.346827 1.403329 1.414914 1.440000
## highest: 3.147747 3.164237 3.174743 3.337822 3.757641
## ---------------------------------------------------------------------------
## NR2A_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    3.844    1.047    2.510    2.726 
##      .25      .50      .75      .90      .95 
##    3.156    3.761    4.440    5.119    5.537 
## 
## lowest : 1.737540 1.794716 1.814940 1.836636 1.860128
## highest: 6.492660 6.766150 7.031754 7.120759 8.482553
## ---------------------------------------------------------------------------
## pAKT_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1076        1   0.2332  0.04534   0.1748   0.1853 
##      .25      .50      .75      .90      .95 
##   0.2058   0.2312   0.2573   0.2841   0.3016 
## 
## lowest : 0.06323601 0.07548701 0.07946370 0.08635097 0.12099984
## highest: 0.35997415 0.36971299 0.39228530 0.43462381 0.53905013
## ---------------------------------------------------------------------------
## pBRAF_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1075        1   0.1818  0.02968   0.1413   0.1479 
##      .25      .50      .75      .90      .95 
##   0.1646   0.1823   0.1974   0.2140   0.2257 
## 
## lowest : 0.06404259 0.06461039 0.06785481 0.09800699 0.10764925
## highest: 0.25964912 0.27419355 0.27946458 0.31295488 0.31706559
## ---------------------------------------------------------------------------
## pCAMKII_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    3.537    1.466    1.854    2.043 
##      .25      .50      .75      .90      .95 
##    2.480    3.327    4.482    5.384    5.938 
## 
## lowest : 1.343998 1.369898 1.393799 1.402336 1.414933
## highest: 6.947290 6.951164 7.021525 7.104642 7.464070
## ---------------------------------------------------------------------------
## pCREB_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.2126  0.03675   0.1602   0.1713 
##      .25      .50      .75      .90      .95 
##   0.1908   0.2106   0.2346   0.2547   0.2672 
## 
## lowest : 0.1128118 0.1274008 0.1274832 0.1327651 0.1348824
## highest: 0.3024105 0.3045564 0.3051907 0.3057732 0.3062472
## ---------------------------------------------------------------------------
## pELK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    1.429   0.4031   0.9166   1.0280 
##      .25      .50      .75      .90      .95 
##   1.2037   1.3558   1.5613   1.7776   2.0011 
## 
## lowest : 0.4290323 0.7207668 0.7342256 0.7550437 0.7701149
## highest: 4.4973374 4.6878975 4.8579761 4.9426036 6.1133475
## ---------------------------------------------------------------------------
## pERK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.5459    0.302   0.2650   0.2955 
##      .25      .50      .75      .90      .95 
##   0.3374   0.4436   0.6633   0.8854   1.0370 
## 
## lowest : 0.1491552 0.1609499 0.1795298 0.1897619 0.1912208
## highest: 2.8068632 2.9163966 3.0645281 3.2892735 3.5666854
## ---------------------------------------------------------------------------
## pJNK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1076        1   0.3135    0.057   0.2230   0.2455 
##      .25      .50      .75      .90      .95 
##   0.2812   0.3213   0.3487   0.3726   0.3899 
## 
## lowest : 0.05211039 0.05565414 0.05706344 0.07934163 0.08177522
## highest: 0.42277359 0.42427943 0.44455213 0.46906841 0.49342586
## ---------------------------------------------------------------------------
## PKCA_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.3179  0.05912   0.2352   0.2542 
##      .25      .50      .75      .90      .95 
##   0.2818   0.3130   0.3523   0.3915   0.4100 
## 
## lowest : 0.1914307 0.1947279 0.1963828 0.1979837 0.2026424
## highest: 0.4534500 0.4603215 0.4628048 0.4638056 0.4739920
## ---------------------------------------------------------------------------
## pMEK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    0.275  0.05127   0.2001   0.2151 
##      .25      .50      .75      .90      .95 
##   0.2443   0.2774   0.3034   0.3306   0.3507 
## 
## lowest : 0.05681818 0.06968866 0.10595160 0.14645522 0.15120782
## highest: 0.41694421 0.42176128 0.42388451 0.43827611 0.45800055
## ---------------------------------------------------------------------------
## pNR1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.8258   0.1322   0.6448   0.6843 
##      .25      .50      .75      .90      .95 
##   0.7435   0.8211   0.8985   0.9795   1.0243 
## 
## lowest : 0.5001597 0.5034538 0.5167739 0.5186909 0.5208758
## highest: 1.1395349 1.1609035 1.1852789 1.2447215 1.4081688
## ---------------------------------------------------------------------------
## pNR2A_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.7269   0.2125   0.4375   0.4903 
##      .25      .50      .75      .90      .95 
##   0.5903   0.7196   0.8486   0.9796   1.0567 
## 
## lowest : 0.2812848 0.3058339 0.3071190 0.3127208 0.3175686
## highest: 1.2912586 1.2942517 1.3393715 1.3599832 1.4127502
## ---------------------------------------------------------------------------
## pNR2B_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    1.562    0.303    1.129    1.225 
##      .25      .50      .75      .90      .95 
##    1.381    1.564    1.749    1.912    1.987 
## 
## lowest : 0.3016086 0.3144841 0.3374778 0.8119808 0.8282731
## highest: 2.2385174 2.2455228 2.2956872 2.3752286 2.7239654
## ---------------------------------------------------------------------------
## pPKCAB_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    1.525   0.5381   0.9056   1.0072 
##      .25      .50      .75      .90      .95 
##   1.1683   1.3657   1.8859   2.2623   2.4003 
## 
## lowest : 0.5678405 0.5989071 0.6461791 0.6560623 0.6581806
## highest: 2.8253239 2.8267220 2.8409339 2.9324093 3.0613871
## ---------------------------------------------------------------------------
## pRSK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.4428  0.07282   0.3380   0.3657 
##      .25      .50      .75      .90      .95 
##   0.4041   0.4406   0.4821   0.5290   0.5535 
## 
## lowest : 0.09594156 0.09743507 0.10725965 0.15978078 0.15994936
## highest: 0.62896711 0.63233602 0.63261819 0.63360740 0.65096181
## ---------------------------------------------------------------------------
## AKT_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.6822   0.1412   0.4819   0.5394 
##      .25      .50      .75      .90      .95 
##   0.5968   0.6825   0.7597   0.8476   0.8929 
## 
## lowest : 0.06442119 0.06444805 0.07114051 0.31888227 0.33450120
## highest: 1.03657523 1.04282188 1.05856833 1.06077147 1.18217474
## ---------------------------------------------------------------------------
## BRAF_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.3785   0.1749   0.2144   0.2337 
##      .25      .50      .75      .90      .95 
##   0.2643   0.3267   0.4136   0.5841   0.6622 
## 
## lowest : 0.1438936 0.1462868 0.1528569 0.1545428 0.1578467
## highest: 1.9403759 1.9954571 1.9992518 2.0888322 2.1334157
## ---------------------------------------------------------------------------
## CAMKII_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.3634  0.05789   0.2801   0.2994 
##      .25      .50      .75      .90      .95 
##   0.3309   0.3603   0.3939   0.4303   0.4540 
## 
## lowest : 0.2129595 0.2222004 0.2324005 0.2352753 0.2356193
## highest: 0.5319950 0.5513820 0.5635500 0.5740181 0.5862445
## ---------------------------------------------------------------------------
## CREB_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1073        1   0.1805  0.02919   0.1418   0.1480 
##      .25      .50      .75      .90      .95 
##   0.1618   0.1796   0.1957   0.2128   0.2249 
## 
## lowest : 0.1136364 0.1155045 0.1178280 0.1198825 0.1227477
## highest: 0.2752204 0.2752387 0.2754036 0.2867540 0.3195582
## ---------------------------------------------------------------------------
## ELK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1062       18     1062        1    1.173   0.3547   0.7749   0.8478 
##      .25      .50      .75      .90      .95 
##   0.9444   1.0962   1.3236   1.6568   1.8402 
## 
## lowest : 0.4976950 0.5740672 0.5978463 0.6022750 0.6045541
## highest: 2.5244799 2.5713606 2.5782326 2.6580195 2.8029483
## ---------------------------------------------------------------------------
## ERK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    2.474   0.7295    1.550    1.684 
##      .25      .50      .75      .90      .95 
##    1.992    2.401    2.873    3.422    3.640 
## 
## lowest : 1.131796 1.169514 1.174350 1.209362 1.238674
## highest: 4.447951 4.643823 4.804018 4.892803 5.198404
## ---------------------------------------------------------------------------
## GSK3B_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    1.173   0.2605   0.8102   0.8866 
##      .25      .50      .75      .90      .95 
##   1.0231   1.1598   1.3097   1.4441   1.5159 
## 
## lowest : 0.1511243 0.5373899 0.5748255 0.5902113 0.6048699
## highest: 2.3230113 2.3683565 2.4377254 2.4401444 2.4757512
## ---------------------------------------------------------------------------
## JNK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.2416  0.03658   0.1868   0.2005 
##      .25      .50      .75      .90      .95 
##   0.2204   0.2449   0.2633   0.2804   0.2881 
## 
## lowest : 0.04629779 0.05584416 0.06245912 0.06457058 0.06597808
## highest: 0.32198500 0.32400403 0.33420094 0.36140483 0.38719068
## ---------------------------------------------------------------------------
## MEK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1073        7     1072        1   0.2728  0.04623   0.2004   0.2180 
##      .25      .50      .75      .90      .95 
##   0.2471   0.2734   0.3008   0.3241   0.3379 
## 
## lowest : 0.1472015 0.1501014 0.1546003 0.1632522 0.1652432
## highest: 0.3828926 0.3833505 0.3907032 0.3966399 0.4154079
## ---------------------------------------------------------------------------
## TRKA_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1075        1   0.6932   0.1356   0.4845   0.5278 
##      .25      .50      .75      .90      .95 
##   0.6171   0.7050   0.7742   0.8417   0.8829 
## 
## lowest : 0.1987434 0.2104558 0.2220249 0.3525928 0.3624257
## highest: 0.9567257 0.9598168 0.9795088 0.9800098 1.0016229
## ---------------------------------------------------------------------------
## RSK_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1074        1   0.1684  0.03118   0.1254   0.1335 
##      .25      .50      .75      .90      .95 
##   0.1496   0.1667   0.1845   0.2046   0.2185 
## 
## lowest : 0.1073944 0.1117441 0.1128538 0.1136638 0.1137255
## highest: 0.2699050 0.2758370 0.2801814 0.2814722 0.3051360
## ---------------------------------------------------------------------------
## APP_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.4048  0.06861   0.3080   0.3281 
##      .25      .50      .75      .90      .95 
##   0.3663   0.4020   0.4419   0.4880   0.5111 
## 
## lowest : 0.2355954 0.2423562 0.2478524 0.2553626 0.2560951
## highest: 0.5735264 0.5900151 0.5906473 0.6160290 0.6326627
## ---------------------------------------------------------------------------
## Bcatenin_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1062       18     1062        1    2.147   0.4936    1.482    1.597 
##      .25      .50      .75      .90      .95 
##    1.827    2.115    2.424    2.734    2.948 
## 
## lowest : 1.134886 1.173540 1.179566 1.185019 1.202827
## highest: 3.253195 3.266385 3.291441 3.299918 3.680552
## ---------------------------------------------------------------------------
## SOD1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.5426   0.2961   0.2705   0.2866 
##      .25      .50      .75      .90      .95 
##   0.3196   0.4441   0.6958   0.9358   1.0757 
## 
## lowest : 0.2171202 0.2303602 0.2339398 0.2339517 0.2398032
## highest: 1.5919598 1.6105212 1.7144296 1.8620317 1.8728985
## ---------------------------------------------------------------------------
## MTOR_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.4525  0.07304   0.3498   0.3731 
##      .25      .50      .75      .90      .95 
##   0.4104   0.4525   0.4880   0.5392   0.5663 
## 
## lowest : 0.2011434 0.2085514 0.2169581 0.2400371 0.2426975
## highest: 0.6277325 0.6457956 0.6468241 0.6515966 0.6767480
## ---------------------------------------------------------------------------
## P38_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1075        1   0.4153  0.09833   0.2890   0.3077 
##      .25      .50      .75      .90      .95 
##   0.3520   0.4078   0.4663   0.5235   0.5804 
## 
## lowest : 0.2278804 0.2360062 0.2365752 0.2375201 0.2423227
## highest: 0.7219355 0.7530089 0.7656140 0.7674141 0.9332563
## ---------------------------------------------------------------------------
## pMTOR_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1    0.759   0.1331   0.5829   0.6208 
##      .25      .50      .75      .90      .95 
##   0.6835   0.7608   0.8415   0.9067   0.9463 
## 
## lowest : 0.1665787 0.1811408 0.1918159 0.2088957 0.2132967
## highest: 1.0525444 1.0582724 1.0780362 1.1053106 1.1248834
## ---------------------------------------------------------------------------
## DSCR1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.5852   0.1069   0.4477   0.4796 
##      .25      .50      .75      .90      .95 
##   0.5309   0.5767   0.6344   0.7103   0.7671 
## 
## lowest : 0.1553210 0.1765216 0.1781864 0.1855825 0.1978178
## highest: 0.8884452 0.8928134 0.8947796 0.8949178 0.9164295
## ---------------------------------------------------------------------------
## AMPKA_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1075        1   0.3684   0.0687   0.2847   0.3001 
##      .25      .50      .75      .90      .95 
##   0.3266   0.3585   0.4008   0.4555   0.4838 
## 
## lowest : 0.2264087 0.2326615 0.2346786 0.2366105 0.2367573
## highest: 0.5641799 0.5707633 0.5984155 0.6149626 0.7008385
## ---------------------------------------------------------------------------
## NR2B_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.5653  0.09556   0.4340   0.4686 
##      .25      .50      .75      .90      .95 
##   0.5149   0.5635   0.6145   0.6794   0.7115 
## 
## lowest : 0.1847845 0.2039386 0.2042758 0.2066598 0.2210465
## highest: 0.8100804 0.8264809 0.8843568 0.9298017 0.9720198
## ---------------------------------------------------------------------------
## pNUMB_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.3571  0.06999   0.2694   0.2823 
##      .25      .50      .75      .90      .95 
##   0.3128   0.3474   0.3927   0.4457   0.4760 
## 
## lowest : 0.1855976 0.2116105 0.2174155 0.2222222 0.2247272
## highest: 0.5624407 0.5722211 0.5814696 0.5956775 0.6310522
## ---------------------------------------------------------------------------
## RAPTOR_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1077        1   0.3158  0.05997   0.2425   0.2559 
##      .25      .50      .75      .90      .95 
##   0.2761   0.3049   0.3473   0.3938   0.4204 
## 
## lowest : 0.1948245 0.2070140 0.2159514 0.2184738 0.2208304
## highest: 0.5016442 0.5042681 0.5089263 0.5089783 0.5266814
## ---------------------------------------------------------------------------
## TIAM1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1075        1   0.4186  0.07391   0.3274   0.3422 
##      .25      .50      .75      .90      .95 
##   0.3720   0.4072   0.4560   0.5094   0.5456 
## 
## lowest : 0.2377771 0.2592782 0.2704333 0.2800111 0.2832882
## highest: 0.6554954 0.6608612 0.6971869 0.7146080 0.7221216
## ---------------------------------------------------------------------------
## pP70S6_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1077        3     1076        1   0.3945    0.171   0.1728   0.1998 
##      .25      .50      .75      .90      .95 
##   0.2811   0.3777   0.4811   0.5851   0.6770 
## 
## lowest : 0.1311198 0.1345354 0.1355658 0.1398453 0.1403043
## highest: 1.0100939 1.0482345 1.0854010 1.0872595 1.1291715
## ---------------------------------------------------------------------------
## NUMB_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.1811  0.03246   0.1400   0.1473 
##      .25      .50      .75      .90      .95 
##   0.1593   0.1782   0.1972   0.2198   0.2354 
## 
## lowest : 0.1179985 0.1184910 0.1209077 0.1225590 0.1227794
## highest: 0.2805684 0.2809826 0.3052795 0.3064755 0.3165753
## ---------------------------------------------------------------------------
## P70S6_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.9431   0.1932   0.6826   0.7293 
##      .25      .50      .75      .90      .95 
##   0.8267   0.9313   1.0451   1.1842   1.2493 
## 
## lowest : 0.3441198 0.3592496 0.4711711 0.5831042 0.5895028
## highest: 1.4348178 1.4401663 1.5755662 1.6075684 1.6799532
## ---------------------------------------------------------------------------
## pGSK3B_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.1612  0.02121   0.1321   0.1371 
##      .25      .50      .75      .90      .95 
##   0.1493   0.1602   0.1717   0.1860   0.1950 
## 
## lowest : 0.09997585 0.10924965 0.11503639 0.11510337 0.11624730
## highest: 0.22691722 0.22779860 0.24349387 0.24430931 0.25321009
## ---------------------------------------------------------------------------
## pPKCG_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1    1.707   0.6596   0.7864   0.9501 
##      .25      .50      .75      .90      .95 
##   1.2968   1.6646   2.1130   2.5103   2.6893 
## 
## lowest : 0.5987666 0.6005737 0.6031650 0.6058048 0.6126829
## highest: 3.1968893 3.1993032 3.2440019 3.2715417 3.3819763
## ---------------------------------------------------------------------------
## CDK5_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.2924   0.0386   0.2330   0.2452 
##      .25      .50      .75      .90      .95 
##   0.2726   0.2938   0.3125   0.3321   0.3501 
## 
## lowest : 0.1811570 0.1975619 0.2030083 0.2049701 0.2050037
## highest: 0.3850295 0.3953077 0.4062779 0.4141955 0.8174018
## ---------------------------------------------------------------------------
## S6_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.4292   0.1562   0.2426   0.2733 
##      .25      .50      .75      .90      .95 
##   0.3167   0.4010   0.5349   0.6240   0.6699 
## 
## lowest : 0.1302063 0.1455924 0.1549423 0.1622186 0.1701055
## highest: 0.7806789 0.7870346 0.7877019 0.8073863 0.8226108
## ---------------------------------------------------------------------------
## ADARB1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1    1.197   0.3942   0.7433   0.8165 
##      .25      .50      .75      .90      .95 
##   0.9305   1.1283   1.3802   1.6893   1.9296 
## 
## lowest : 0.5291078 0.5389338 0.5489707 0.5583358 0.5586451
## highest: 2.3835378 2.4026476 2.4047619 2.4675499 2.5398896
## ---------------------------------------------------------------------------
## AcetylH3K9_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.2165    0.168  0.07039  0.07959 
##      .25      .50      .75      .90      .95 
##  0.10357  0.15042  0.26965  0.37512  0.58872 
## 
## lowest : 0.05252842 0.05265137 0.05385404 0.05401287 0.05458515
## highest: 1.11472275 1.12909633 1.15685651 1.34237767 1.45938685
## ---------------------------------------------------------------------------
## RRP1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.1666  0.02866   0.1338   0.1401 
##      .25      .50      .75      .90      .95 
##   0.1490   0.1621   0.1774   0.1968   0.2116 
## 
## lowest : -0.06200787  0.11689974  0.11897251  0.11897888  0.11915638
## highest:  0.34111544  0.35211451  0.39148040  0.42855903  0.61237703
## ---------------------------------------------------------------------------
## BAX_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.1793   0.0208   0.1469   0.1551 
##      .25      .50      .75      .90      .95 
##   0.1682   0.1807   0.1916   0.2021   0.2086 
## 
## lowest : 0.07232553 0.10230393 0.11203461 0.11329605 0.11648966
## highest: 0.22014093 0.22115783 0.22282524 0.22378610 0.24114105
## ---------------------------------------------------------------------------
## ARC_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.1215  0.01628  0.09864  0.10309 
##      .25      .50      .75      .90      .95 
##  0.11084  0.12163  0.13196  0.14017  0.14487 
## 
## lowest : 0.06725429 0.07544495 0.07680510 0.08379825 0.08439776
## highest: 0.15402073 0.15411897 0.15687826 0.15736561 0.15874781
## ---------------------------------------------------------------------------
## ERBB4_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1079        1   0.1565  0.01675   0.1325   0.1389 
##      .25      .50      .75      .90      .95 
##   0.1470   0.1564   0.1654   0.1757   0.1823 
## 
## lowest : 0.1002173 0.1058711 0.1079084 0.1083058 0.1103268
## highest: 0.1970899 0.1974948 0.1991830 0.2002162 0.2086975
## ---------------------------------------------------------------------------
## nNOS_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1079        1   0.1813  0.02796   0.1372   0.1475 
##      .25      .50      .75      .90      .95 
##   0.1665   0.1827   0.1986   0.2115   0.2194 
## 
## lowest : 0.09973436 0.10631596 0.10702494 0.11255309 0.11293103
## highest: 0.24577523 0.25297377 0.25599706 0.26029255 0.26073863
## ---------------------------------------------------------------------------
## Tau_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.2105   0.0691   0.1415   0.1485 
##      .25      .50      .75      .90      .95 
##   0.1680   0.1886   0.2339   0.3040   0.3493 
## 
## lowest : 0.09623279 0.09969325 0.10015519 0.11597875 0.11728308
## highest: 0.52240543 0.53088381 0.56712673 0.58043616 0.60276806
## ---------------------------------------------------------------------------
## GFAP_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1079        1   0.1209  0.01429   0.1000   0.1046 
##      .25      .50      .75      .90      .95 
##   0.1128   0.1205   0.1277   0.1365   0.1421 
## 
## lowest : 0.08611418 0.08766644 0.08783043 0.08821604 0.08853316
## highest: 0.17530140 0.17558779 0.17828298 0.19703050 0.21362064
## ---------------------------------------------------------------------------
## GluR3_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.2219  0.03932   0.1743   0.1827 
##      .25      .50      .75      .90      .95 
##   0.1957   0.2169   0.2460   0.2701   0.2844 
## 
## lowest : 0.1113821 0.1200208 0.1244546 0.1268955 0.1282082
## highest: 0.3148309 0.3182612 0.3203194 0.3203515 0.3310159
## ---------------------------------------------------------------------------
## GluR4_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1079        1   0.1266  0.02613  0.09306  0.09931 
##      .25      .50      .75      .90      .95 
##  0.10889  0.12355  0.14195  0.15542  0.16365 
## 
## lowest : 0.07257968 0.07329147 0.07336244 0.08034450 0.08215206
## highest: 0.18831047 0.18891397 0.25719720 0.42069863 0.53700410
## ---------------------------------------------------------------------------
## IL1B_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.5273  0.09175   0.3920   0.4222 
##      .25      .50      .75      .90      .95 
##   0.4756   0.5267   0.5770   0.6276   0.6629 
## 
## lowest : 0.2840013 0.2962466 0.3045522 0.3130310 0.3296246
## highest: 0.7703358 0.7761248 0.7831150 0.8423141 0.8897351
## ---------------------------------------------------------------------------
## P3525_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.2913  0.03395   0.2434   0.2542 
##      .25      .50      .75      .90      .95 
##   0.2701   0.2906   0.3116   0.3295   0.3415 
## 
## lowest : 0.2074378 0.2129542 0.2155300 0.2157518 0.2166631
## highest: 0.3625067 0.3651188 0.3655389 0.3667681 0.4437350
## ---------------------------------------------------------------------------
## pCASP9_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1    1.548   0.2774    1.191    1.248 
##      .25      .50      .75      .90      .95 
##    1.376    1.523    1.713    1.880    1.971 
## 
## lowest : 0.8531756 0.8974752 0.9157394 0.9458182 0.9664215
## highest: 2.4031145 2.4548915 2.5101213 2.5314108 2.5862159
## ---------------------------------------------------------------------------
## PSD95_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1    2.235   0.2844    1.799    1.910 
##      .25      .50      .75      .90      .95 
##    2.079    2.242    2.420    2.543    2.603 
## 
## lowest : 1.206098 1.321953 1.331925 1.345682 1.402583
## highest: 2.837367 2.844401 2.851444 2.851852 2.877873
## ---------------------------------------------------------------------------
## SNCA_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1079        1   0.1598  0.02688   0.1245   0.1306 
##      .25      .50      .75      .90      .95 
##   0.1428   0.1575   0.1733   0.1920   0.2056 
## 
## lowest : 0.1012332 0.1081962 0.1082343 0.1092731 0.1094067
## highest: 0.2352558 0.2366942 0.2429692 0.2512195 0.2576159
## ---------------------------------------------------------------------------
## Ubiquitin_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1    1.239   0.1967   0.9608   1.0097 
##      .25      .50      .75      .90      .95 
##   1.1163   1.2366   1.3631   1.4646   1.5112 
## 
## lowest : 0.7506641 0.8118216 0.8307528 0.8308095 0.8321190
## highest: 1.7328926 1.7407243 1.8393543 1.8936446 1.8972023
## ---------------------------------------------------------------------------
## pGSK3B_Tyr216_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.8488   0.1062   0.6839   0.7196 
##      .25      .50      .75      .90      .95 
##   0.7937   0.8499   0.9162   0.9662   0.9935 
## 
## lowest : 0.5773968 0.5819287 0.5977821 0.6051203 0.6062151
## highest: 1.1164285 1.1198982 1.1463778 1.1508555 1.2045981
## ---------------------------------------------------------------------------
## SHH_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.2267  0.03151   0.1867   0.1938 
##      .25      .50      .75      .90      .95 
##   0.2064   0.2240   0.2417   0.2609   0.2803 
## 
## lowest : 0.1558693 0.1626419 0.1655822 0.1698028 0.1725882
## highest: 0.3391304 0.3443132 0.3522055 0.3560009 0.3582888
## ---------------------------------------------------------------------------
## BAD_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      867      213      866        1   0.1579  0.03261   0.1189   0.1248 
##      .25      .50      .75      .90      .95 
##   0.1364   0.1523   0.1740   0.1984   0.2136 
## 
## lowest : 0.08830462 0.09236149 0.09768409 0.10076730 0.10223765
## highest: 0.24734918 0.25251201 0.25524157 0.26619343 0.28201635
## ---------------------------------------------------------------------------
## BCL2_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      795      285      795        1   0.1348  0.02942   0.1005   0.1058 
##      .25      .50      .75      .90      .95 
##   0.1156   0.1295   0.1482   0.1708   0.1859 
## 
## lowest : 0.08065685 0.08447381 0.08950891 0.09212676 0.09280994
## highest: 0.23393124 0.23802129 0.24309952 0.26094571 0.26150572
## ---------------------------------------------------------------------------
## pS6_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1   0.1215  0.01628  0.09864  0.10309 
##      .25      .50      .75      .90      .95 
##  0.11084  0.12163  0.13196  0.14017  0.14487 
## 
## lowest : 0.06725429 0.07544495 0.07680510 0.08379825 0.08439776
## highest: 0.15402073 0.15411897 0.15687826 0.15736561 0.15874781
## ---------------------------------------------------------------------------
## pCFOS_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1005       75     1005        1   0.1311  0.02587   0.1009   0.1051 
##      .25      .50      .75      .90      .95 
##   0.1135   0.1265   0.1437   0.1635   0.1747 
## 
## lowest : 0.08541915 0.09067696 0.09068140 0.09109544 0.09176119
## highest: 0.24039102 0.24271201 0.24535679 0.24768212 0.25652893
## ---------------------------------------------------------------------------
## SYP_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1079        1   0.4461  0.07471   0.3365   0.3622 
##      .25      .50      .75      .90      .95 
##   0.3981   0.4485   0.4908   0.5265   0.5530 
## 
## lowest : 0.2586258 0.2601463 0.2733639 0.2776650 0.2809251
## highest: 0.6059889 0.6362787 0.7221362 0.7406525 0.7595884
## ---------------------------------------------------------------------------
## H3AcK18_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      900      180      900        1   0.1696  0.06124   0.1027   0.1105 
##      .25      .50      .75      .90      .95 
##   0.1258   0.1582   0.1979   0.2366   0.2836 
## 
## lowest : 0.07969090 0.08236342 0.08530777 0.08678524 0.08685403
## highest: 0.45084043 0.45355352 0.45509434 0.46016787 0.47976327
## ---------------------------------------------------------------------------
## EGR1_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      870      210      870        1   0.1831  0.04362   0.1332   0.1402 
##      .25      .50      .75      .90      .95 
##   0.1551   0.1749   0.2045   0.2363   0.2597 
## 
## lowest : 0.1055372 0.1061346 0.1070948 0.1074335 0.1081293
## highest: 0.3230668 0.3261426 0.3371865 0.3408836 0.3606921
## ---------------------------------------------------------------------------
## H3MeK4_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      810      270      810        1   0.2054  0.06065   0.1345   0.1453 
##      .25      .50      .75      .90      .95 
##   0.1651   0.1940   0.2352   0.2833   0.3144 
## 
## lowest : 0.1017870 0.1049287 0.1126895 0.1138108 0.1151487
## highest: 0.3820815 0.3833175 0.3879541 0.3958202 0.4139027
## ---------------------------------------------------------------------------
## CaNA_N 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1080        0     1080        1    1.338   0.3632   0.8870   0.9511 
##      .25      .50      .75      .90      .95 
##   1.0814   1.3174   1.5858   1.7695   1.8668 
## 
## lowest : 0.5864788 0.6275812 0.6433178 0.6433882 0.6614039
## highest: 2.0744911 2.0770839 2.1038759 2.1155551 2.1297911
## ---------------------------------------------------------------------------
gene<-mice_dat

b).Find the missing value, then delete variables with more than 10% missing value and using mean imputation to deal with others.

N_NA<-sapply(mice_dat[,c(2:78)], function(x) sum(is.na(x)))
N_NA<-N_NA/dim(mice_dat)[1]
#remove variables with more than 10% missing values
gene<-gene[,!names(gene)%in%names(N_NA)[which(N_NA>=0.1)]]
#a simple function to perform mean imputation
NA2mean <- function(x) replace(x, is.na(x), mean(x, na.rm = TRUE))
gene[,c(2:73)]<-lapply(gene[,c(2:73)], NA2mean)

2.a). Perform PCA on imputed expression measurements

pca = prcomp(gene[,c(2:73)], center = TRUE)
#explained variance
expl.var <- pca$sdev^2/sum(pca$sdev^2)*100 # percent explained variance
cum.var<-cumsum(expl.var)
#Plot PC in ggplot2
dat<-data.frame(expl.var, cum.var)
dat$PC<-1:72

p0<-ggplot(dat, aes(x = PC, y = expl.var)) + geom_point()+geom_line()
p0<-p0+xlab("Principal Component")
p0<-p0+ylab("Cumulative Proportion of variance")
p0

p<-ggplot(dat, aes(x = PC, y = cum.var)) + geom_point()+geom_line()
p<-p+xlab("Principal Component")
p<-p+ylab("Cumulative Proportion of variance")
p

According to the scree plot, with around 19 PCs, the cumulative proportional variance reach to nearly 99%.

b). Scale the data and repeat the above process.

pca3 = prcomp(gene[,c(2:73)], center = TRUE, scale. = TRUE)
#explained variance
expl.var <- pca3$sdev^2/sum(pca3$sdev^2)*100 #percent explained variance
cum.var<-cumsum(expl.var)
#Plot PC in ggplot2
dat<-data.frame(expl.var, cum.var)
dat$PC<-1:72
p<-ggplot(dat, aes(x = PC, y = expl.var)) + geom_point()+geom_line()
p<-p+xlab("Principal Component")
p<-p+ylab("Proportion of variance")
p

#variance for each gene expression
t<-apply(gene[,c(2:73)], 2, var)
dat0<-data.frame(gene = names(gene[,c(2:73)]), variance = t)
ggplot(data = dat0, aes(x = gene, y = variance))+
      geom_point()+
      theme_classic()+
      theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 3))

After scaling the data, the explained proportion of variance in the first several PCs were significantly reduced compare to the one without scaling. The possible reason is that there are several outliers of genes with large variance, such as pCAMKII_N.

c).

According to the plot, overall, there is no significant good separation by PC1 or PC2 in recapturing homogeneous groups of behaviors, treatment, Genotype, and class. Most of the groups have overlapping observations in the PC space except for behavior. PC2 may have some separation in behavior.

d). i. Fit a logistic regression using PC scores

temp<-as.data.frame(pca$x)
temp$Genotype<-ifelse(gene$Genotype == "Control", 0, 1)
temp$Genotype<-as.factor(temp$Genotype)
fit <- glm(Genotype~., data = temp, family=binomial(link='logit'))
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(fit)
## 
## Call:
## glm(formula = Genotype ~ ., family = binomial(link = "logit"), 
##     data = temp)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -1.511e-04  -2.100e-08  -2.100e-08   2.100e-08   1.588e-04  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  3.897e-01  1.543e+04   0.000    1.000
## PC1         -1.562e+00  7.093e+03   0.000    1.000
## PC2         -1.269e+01  1.279e+04  -0.001    0.999
## PC3          1.360e+02  3.351e+04   0.004    0.997
## PC4          7.691e+01  1.541e+04   0.005    0.996
## PC5         -1.502e+02  3.425e+04  -0.004    0.996
## PC6         -5.153e+01  4.216e+04  -0.001    0.999
## PC7         -4.360e+01  4.299e+04  -0.001    0.999
## PC8         -1.226e+02  1.139e+05  -0.001    0.999
## PC9         -1.736e+02  7.148e+04  -0.002    0.998
## PC10        -3.275e+02  1.262e+05  -0.003    0.998
## PC11        -2.359e+02  7.925e+04  -0.003    0.998
## PC12         1.227e+01  3.661e+05   0.000    1.000
## PC13        -3.005e+02  2.224e+05  -0.001    0.999
## PC14        -6.848e+01  1.650e+05   0.000    1.000
## PC15         4.152e+02  1.586e+05   0.003    0.998
## PC16         5.958e+02  2.142e+05   0.003    0.998
## PC17        -9.404e+02  1.591e+05  -0.006    0.995
## PC18         1.566e+03  1.710e+05   0.009    0.993
## PC19        -2.218e+02  1.845e+05  -0.001    0.999
## PC20        -5.634e+02  2.674e+05  -0.002    0.998
## PC21        -1.144e+02  1.855e+05  -0.001    1.000
## PC22        -1.732e+03  2.769e+05  -0.006    0.995
## PC23        -7.662e+02  3.274e+05  -0.002    0.998
## PC24         5.095e+01  3.684e+05   0.000    1.000
## PC25        -5.576e+02  1.314e+05  -0.004    0.997
## PC26        -6.791e+02  1.594e+05  -0.004    0.997
## PC27        -1.925e+03  3.401e+05  -0.006    0.995
## PC28        -3.653e+02  5.557e+05  -0.001    0.999
## PC29        -2.362e+03  6.380e+05  -0.004    0.997
## PC30         3.653e+02  2.749e+05   0.001    0.999
## PC31        -8.847e+02  7.635e+05  -0.001    0.999
## PC32        -1.008e+03  8.376e+05  -0.001    0.999
## PC33         1.410e+03  3.782e+05   0.004    0.997
## PC34         2.053e+03  4.193e+05   0.005    0.996
## PC35        -1.851e+03  1.187e+06  -0.002    0.999
## PC36         1.129e+03  9.199e+05   0.001    0.999
## PC37         2.644e+02  1.115e+06   0.000    1.000
## PC38         1.126e+03  3.975e+05   0.003    0.998
## PC39         3.968e+02  5.356e+05   0.001    0.999
## PC40        -2.572e+02  2.877e+05  -0.001    0.999
## PC41        -2.361e+03  1.105e+06  -0.002    0.998
## PC42         1.796e+03  6.255e+05   0.003    0.998
## PC43        -9.353e+02  6.501e+05  -0.001    0.999
## PC44         1.709e+03  7.840e+05   0.002    0.998
## PC45        -1.904e+03  8.120e+05  -0.002    0.998
## PC46         1.167e+03  1.315e+06   0.001    0.999
## PC47        -1.189e+03  8.850e+05  -0.001    0.999
## PC48         2.472e+03  5.507e+05   0.004    0.996
## PC49        -1.104e+03  6.358e+05  -0.002    0.999
## PC50        -1.643e+03  8.151e+05  -0.002    0.998
## PC51        -1.844e+03  1.223e+06  -0.002    0.999
## PC52         1.172e+03  1.557e+06   0.001    0.999
## PC53         2.261e+03  1.829e+06   0.001    0.999
## PC54         2.202e+02  9.896e+05   0.000    1.000
## PC55         1.963e+03  4.718e+05   0.004    0.997
## PC56         1.605e+03  1.430e+06   0.001    0.999
## PC57         2.142e+03  1.138e+06   0.002    0.998
## PC58        -1.695e+03  9.246e+05  -0.002    0.999
## PC59         3.720e+03  1.845e+06   0.002    0.998
## PC60        -2.772e+03  9.049e+05  -0.003    0.998
## PC61        -5.348e+02  1.433e+06   0.000    1.000
## PC62         3.329e+02  1.277e+06   0.000    1.000
## PC63         2.599e+03  2.416e+06   0.001    0.999
## PC64         9.139e+02  1.128e+06   0.001    0.999
## PC65         1.213e+03  4.147e+06   0.000    1.000
## PC66         7.976e+01  9.888e+05   0.000    1.000
## PC67         6.641e+02  1.570e+06   0.000    1.000
## PC68        -2.177e+03  1.751e+06  -0.001    0.999
## PC69         4.974e+02  3.631e+06   0.000    1.000
## PC70         6.282e+03  1.891e+06   0.003    0.997
## PC71         4.059e+02  2.078e+06   0.000    1.000
## PC72         9.403e+18  3.743e+21   0.003    0.998
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.4939e+03  on 1079  degrees of freedom
## Residual deviance: 4.4879e-07  on 1007  degrees of freedom
## AIC: 146
## 
## Number of Fisher Scoring iterations: 25

Because I have 72 varibles in my model, I am confronted to an over-fitting problem: too many variables in the model, leading to a perfect separation of genotypes. The consequences is that model likelihood is not defined, and thus I can’t get the model to converge.

  1. Using only meaningful PCs to fit the logistic models
library(pls)
## 
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
## 
##     loadings
fit2<-glm(Genotype~., data = temp[,c(1:19, 73)], family=binomial(link='logit'))
summary(fit2)
## 
## Call:
## glm(formula = Genotype ~ ., family = binomial(link = "logit"), 
##     data = temp[, c(1:19, 73)])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7811  -0.5693  -0.1208   0.5530   3.2466  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.238256   0.093173  -2.557 0.010554 *  
## PC1          0.190359   0.060989   3.121 0.001801 ** 
## PC2         -0.486991   0.081643  -5.965 2.45e-09 ***
## PC3          1.162662   0.141285   8.229  < 2e-16 ***
## PC4          0.093566   0.160263   0.584 0.559337    
## PC5         -2.387215   0.278931  -8.558  < 2e-16 ***
## PC6         -1.204505   0.289795  -4.156 3.23e-05 ***
## PC7         -1.116698   0.345625  -3.231 0.001234 ** 
## PC8         -1.529011   0.368779  -4.146 3.38e-05 ***
## PC9         -2.671859   0.417346  -6.402 1.53e-10 ***
## PC10        -3.190442   0.506785  -6.295 3.06e-10 ***
## PC11        -1.030280   0.607791  -1.695 0.090052 .  
## PC12        -0.071460   0.508266  -0.141 0.888190    
## PC13        -2.867598   0.690202  -4.155 3.26e-05 ***
## PC14         0.005547   0.611633   0.009 0.992763    
## PC15         5.959579   0.755457   7.889 3.05e-15 ***
## PC16         3.549099   0.836709   4.242 2.22e-05 ***
## PC17        -9.819356   1.118729  -8.777  < 2e-16 ***
## PC18        13.452681   1.129721  11.908  < 2e-16 ***
## PC19        -3.629204   1.003107  -3.618 0.000297 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1493.86  on 1079  degrees of freedom
## Residual deviance:  811.68  on 1060  degrees of freedom
## AIC: 851.68
## 
## Number of Fisher Scoring iterations: 6
gene$Genotype<-ifelse(gene$Genotype=="Control", 0, 1)
pcr_model <- pcr(Genotype~., data = gene[,c(2:74)], scale = TRUE, validation = "CV")
validationplot(pcr_model)

We found using the meaningful PCs only still have a good model of fit. By validation plot, we found RMSE for fitted model didn’t change a lot after more PCs are adding to the meaningful PCs.

  1. Clustering a).K-mediods clustering to determine the optimal k
clusters <- hclust(dist(gene[,2:73]))
plot(clusters, labels = FALSE)

From the cluster dendrogram, k = 2, 3 or 4 may be optimal.

pamk.best <- pamk(gene[,2:73])
cat("number of clusters estimated by optimum average silhouette width:", pamk.best$nc, "\n")
## number of clusters estimated by optimum average silhouette width: 3
plot(pam(gene[,2:73], pamk.best$nc))

By estimating silhouette width after partitioning, I found the optimal k is 3.

  1. Here, purity was calculated after comparing cluster group and real labels.
gene$fit.cluster<-cutree(clusters, 3)
sum0<-data.frame(Treatment = purity(gene$fit.cluster, gene$Treatment),
                 Genotype = purity(gene$fit.cluster, gene$Genotype),
                 Behavior = purity(gene$fit.cluster, gene$Behavior),
                 class = purity(gene$fit.cluster, gene$class))
sum0<-format(sum0, digits = 3)
row.names(sum0)<-"Purity"
htmlTable(sum0, caption = "Table 1. Purity after applying hierachial clustering with optimal k=3")
Table 1. Purity after applying hierachial clustering with optimal k=3
Treatment Genotype Behavior class
Purity 0.542 0.528 0.536 0.176
  1. To cluster protein, first we need to calculate correlation matrix and use 1-correlation as dissimlarity index:
cor_dis<-1-cor(gene[,c(2:73)])
dis<-as.dist(cor_dis)
plot(hclust(dis), 
     main="Dissimilarity = 1 - Correlation", xlab = "Protein", cex = 0.5)

b). Hierarchical Clustering:

  1. Cluster Method: Compare complete, single, and average linkage with 4 clusters
Hs <- hclust(dis, method = "single")
Ha <- hclust(dis, method = "average")
Hc <- hclust(dis, method = "complete")

plot(Hs, cex = 0.5)

plot(Ha, cex = 0.5)

plot(Hc, cex = 0.5)

#cluster assignment
assign<-data.frame(Single = cutree(Hs, k =4),
                   Average = cutree(Ha, k =4),
                   Complete = cutree(Hc, k =4))

table(assign$Single)
## 
##  1  2  3  4 
## 67  1  2  2
table(assign$Average)
## 
##  1  2  3  4 
## 48 17  3  4
table(assign$Complete)
## 
##  1  2  3  4 
## 20 25 15 12

Different methods of linkage has very different cluster assignments. For instance, when using single linkage, most of proteins were assigned to one cluster.

  1. Standardizing data: Here, I choose complete linkage
#standardize the data
gene_st<-as.data.frame(lapply(gene[,c(2:73)], function(x) (x-mean(x))/sd(x)))
cor_dis_st<-1-cor(gene_st)
dis_st<-as.dist(cor_dis_st)
Hc_st <- hclust(dis_st, method = "complete")
plot(Hc_st, cex = 0.5)

table(cutree(Hc_st, k = 4))
## 
##  1  2  3  4 
## 20 25 15 12

After standardizing the data of protein expression, the cluster assigment did not change since calculating correlation matrix is not impacted by scaling data.

  1. Distance matrix: Euclidean distance was used instead of correlation distance
Euc_matrix<-dist(t(gene[,2:73]), method = "euclidean")
dis_euc<-as.dist(Euc_matrix)
Hc_euc<-hclust(dis_euc, method = "complete")
plot(Hc_euc, cex = 0.5)

table(cutree(Hc_euc, k = 4))
## 
##  1  2  3  4 
## 56  4  2 10

Compare to using correlation distance as distance metrics, more proteins were assigned to one cluster after using Euclidean distance metrics.

  1. Sampling the data

## 
##  1  2  3  4 
##  9 24 26 13
Table 2. Cluster assignment after sampling (Repeats = 5)
Repeat:1 Repeat:2 Repeat:3 Repeat:4 Repeat:5
Cluster=1 9 26 34 29 13
Cluster=2 24 17 20 18 25
Cluster=3 26 17 15 16 13
Cluster=4 13 12 3 9 21

After repeating sampling process before applying clustering algorithm, we found the results of cluster assignment is similar to the one without sampling.

  1. comments on difference: Among the potential factors including linkage method, cluster distance metrics, sampling, and data scaling, I think linkage methods and distance function are the most important factors that impacts on cluster stability.